Multiple imputation in R using mice

CSP 2021

Emile Latour

February 17, 2021

Load the packages

Here we load the packages that we will be using during the analysis. See the section titled “Links and references” for more info on the packages.

library(tidyverse)       # packages ggplot2, dplyr, tidyr, readr, purrr, tibble, 
                         # stringr, and forcats
library(broom)           # functions tidy(), glance(), augment()
library(janitor)         # for working with dirty data 
library(here)            # simpler way to find your files
library(skimr)           # Compact and Flexible Summaries of Data
library(DT)              # Helpful for looking at large data frames

library(mice)            # Multiple imputation using Fully Conditional Specification
library(naniar)          # structures, summaries, and visualizations for missing data 
library(VIM)             # Visualization and Imputation of Missing Values
library(finalfit)        # Check for associations between missing and observed data.

The data

A simulated data set of Medicaid enrollment data that has been matched with Electronic Health Records (EHR) for the purposes of comparing the Medicaid claims data with the EHR to ensure accuracy/agreement.

The data contains:

Based on real data

To provide a practical real-world example, the data used here was based on an actual data set, but due to complications with using patient data and privacy, all data here has been simulated and similarities to the original data are unintentional.

Data preparation

Load the data

Using the here package to construct the path to the data set. Specifying the column types ahead of time.

# Read the simulated EHR/claims data. The path is built with here() so the
# script runs regardless of the working directory. Column types are declared
# up front so readr does not have to guess them (and so any type drift in
# the raw file is caught at read time). All readr helpers are
# namespace-qualified for consistency with the rest of the script.
data <- readr::read_csv(
  file = here::here("data", 
                    "ehr-and-claims_simulated-data_csp-2021.csv"), 
  col_types = readr::cols(pat_id = readr::col_character(),
                          age = readr::col_double(),
                          sex = readr::col_character(),
                          race_eth = readr::col_character(),
                          language = readr::col_character(),
                          fpl = readr::col_character(),
                          age_f = readr::col_character(),
                          breast_eligibility = readr::col_double(),
                          flu_eligibility = readr::col_double(),
                          cholesterol_eligibility = readr::col_double(),
                          breast_claims = readr::col_double(),
                          flu_claims = readr::col_double(),
                          cholesterol_claims = readr::col_double(),
                          breast_ehr = readr::col_double(),
                          flu_ehr = readr::col_double(),
                          cholesterol_ehr = readr::col_double(), 
                          cholesterol_total = readr::col_double()))

Take a peek

Take note of

dplyr::glimpse(data)
## Rows: 13,101
## Columns: 17
## $ pat_id                  <chr> "05094", "05711", "01013", "11608", "12443", …
## $ age                     <dbl> NA, 48, 26, 59, 61, 34, 54, 59, 50, 40, NA, 6…
## $ sex                     <chr> "Female", "Female", "Female", "Male", "Female…
## $ race_eth                <chr> "Non-Hispanic, White", "Non-Hispanic, White",…
## $ language                <chr> "English", "English", "English", "English", "…
## $ fpl                     <chr> "<=138% FPL", "<=138% FPL", "<=138% FPL", "<=…
## $ age_f                   <chr> NA, "35-<50", "19-<34", "51-<64", "51-<64", "…
## $ breast_eligibility      <dbl> 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, …
## $ flu_eligibility         <dbl> 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, …
## $ cholesterol_eligibility <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ breast_claims           <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ flu_claims              <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ cholesterol_claims      <dbl> 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, …
## $ breast_ehr              <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ flu_ehr                 <dbl> 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ cholesterol_ehr         <dbl> 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, …
## $ cholesterol_total       <dbl> 152.9439, NA, NA, 193.5783, NA, 226.7804, 244…

Light data cleaning

Convert coded variables to factor type.

# Convert the coded character columns to factors with explicit level order,
# so downstream modelling functions and mice treat them consistently.
data <- data %>% 
  mutate(sex = factor(sex, 
                      levels = c("Female", "Male")), 
         race_eth = factor(race_eth, 
                           levels = c("Hispanic",
                                      "Non-Hispanic, Black",
                                      "Non-Hispanic, Other",
                                      "Non-Hispanic, White")),
         language = factor(language, 
                           levels = c("English", 
                                      "Spanish", 
                                      "Other")),
         fpl = factor(fpl, 
                      levels = c("<=138% FPL", 
                                 ">138% FPL")), 
         age_f = factor(age_f, 
                        levels = c("19-<34", 
                                   "35-<50", 
                                   "51-<64")), 
         # The 0/1 indicator columns share one coding (1 = Yes, 0 = No);
         # convert each suffix group in bulk with across()
         dplyr::across(.cols = dplyr::ends_with("_eligibility"), 
                       .fns = ~ factor(., 
                                       levels = c(1, 0), 
                                       labels = c("Yes", "No"))), 
         dplyr::across(.cols = dplyr::ends_with("_claims"), 
                       .fns = ~ factor(., 
                                       levels = c(1, 0), 
                                       labels = c("Yes", "No"))), 
         dplyr::across(.cols = dplyr::ends_with("_ehr"), 
                       .fns = ~ factor(., 
                                       levels = c(1, 0), 
                                       labels = c("Yes", "No")))
  )

# Good step I always do to ensure consistent, clean naming conventions
# (snake_case column names, no special characters)
data <- data %>% 
  janitor::clean_names()

One more peek

Take another glimpse at the data and note how the values have changed.

dplyr::glimpse(data)
## Rows: 13,101
## Columns: 17
## $ pat_id                  <chr> "05094", "05711", "01013", "11608", "12443", …
## $ age                     <dbl> NA, 48, 26, 59, 61, 34, 54, 59, 50, 40, NA, 6…
## $ sex                     <fct> Female, Female, Female, Male, Female, Male, M…
## $ race_eth                <fct> "Non-Hispanic, White", "Non-Hispanic, White",…
## $ language                <fct> English, English, English, English, Spanish, …
## $ fpl                     <fct> <=138% FPL, <=138% FPL, <=138% FPL, <=138% FP…
## $ age_f                   <fct> NA, 35-<50, 19-<34, 51-<64, 51-<64, 19-<34, 5…
## $ breast_eligibility      <fct> Yes, Yes, No, No, Yes, No, No, Yes, No, No, N…
## $ flu_eligibility         <fct> Yes, No, No, Yes, Yes, No, Yes, Yes, No, No, …
## $ cholesterol_eligibility <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, …
## $ breast_claims           <fct> Yes, No, No, No, No, No, No, Yes, No, No, No,…
## $ flu_claims              <fct> No, No, No, Yes, No, No, Yes, No, No, No, No,…
## $ cholesterol_claims      <fct> Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, …
## $ breast_ehr              <fct> Yes, No, No, No, No, No, No, No, No, No, No, …
## $ flu_ehr                 <fct> No, No, Yes, Yes, No, Yes, Yes, No, No, No, N…
## $ cholesterol_ehr         <fct> Yes, No, No, Yes, No, Yes, Yes, No, No, No, Y…
## $ cholesterol_total       <dbl> 152.9439, NA, NA, 193.5783, NA, 226.7804, 244…

Skim the data

skimr::skim() is a nice alternative to the base R function summary(). Though it doesn’t always play nicely depending on your operating system…

# With spark graphs if system allows
# skimr::skim(data)

# Without spark graphs
skimr::skim_without_charts(data)
Data summary
Name data
Number of rows 13101
Number of columns 17
_______________________
Column type frequency:
character 1
factor 14
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
pat_id 0 1 5 5 0 13101 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
sex 1313 0.90 FALSE 2 Fem: 7759, Mal: 4029
race_eth 551 0.96 FALSE 4 Non: 8897, Non: 1406, His: 1226, Non: 1021
language 0 1.00 FALSE 3 Eng: 10948, Oth: 1523, Spa: 630
fpl 2813 0.79 FALSE 2 <=1: 10062, >13: 226
age_f 1947 0.85 FALSE 3 19-: 4306, 35-: 4095, 51-: 2753
breast_eligibility 0 1.00 FALSE 2 No: 8845, Yes: 4256
flu_eligibility 0 1.00 FALSE 2 No: 9424, Yes: 3677
cholesterol_eligibility 0 1.00 FALSE 2 Yes: 12797, No: 304
breast_claims 0 1.00 FALSE 2 No: 11467, Yes: 1634
flu_claims 0 1.00 FALSE 2 No: 11856, Yes: 1245
cholesterol_claims 0 1.00 FALSE 2 No: 7775, Yes: 5326
breast_ehr 0 1.00 FALSE 2 No: 11550, Yes: 1551
flu_ehr 0 1.00 FALSE 2 No: 8789, Yes: 4312
cholesterol_ehr 0 1.00 FALSE 2 No: 8042, Yes: 5059

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
age 1947 0.85 39.68 12.69 19.00 29.00 39.00 49.0 64.00
cholesterol_total 8042 0.39 199.63 42.90 47.23 170.67 199.48 227.7 353.94

Exploratory data analysis (EDA)

One of the most important and involved steps in performing multiple imputation. See slide deck on performing EDA of data with missingness.

Multiple imputation in R with mice package

The aims of this session are to

Set the seed for reproducibility

Set the seed to ensure that all results, figures, etc. are reproducible.

seed_for_imp <- 8675309
set.seed(seed_for_imp)

Set the number of iterations and the number of imputations

Tune the imputation using the default settings to begin with.

The mice algorithm tends to converge rather quickly. Advice is to set the iterations high enough that you see convergence but not so high that it will drastically increase the computation time.

Once the imputation model is specified and convergence issues are addressed, increase the number of imputations for the final model. Allison suggests: the number of imputations should be similar to the percentage of cases that are incomplete. This can also make for long computation time, which is an appropriate price to pay for imputation with large missing.

In our data set the percentage of cases that are incomplete was:

data %>% 
  naniar::miss_prop_summary()
## # A tibble: 1 x 3
##       df   var  case
##    <dbl> <dbl> <dbl>
## 1 0.0746 0.353 0.785

Stick with the defaults for now.

imp_num <- 5   # number of imputations, dflt = 5
iter_num <- 5  # number of iterations, dflt = 5

Naive first pass

Start with an “initial” empty imputation to see what the software does by default. Also, this provides some objects to help in setting up the final imputation model.

See what is contained in the init object that contains the imputations and data about the setup.

# "Dry run": maxit = 0 performs no imputation, but mice still builds and
# returns its default setup (method vector, predictor matrix, visit
# sequence), which we inspect and tailor below. The logged-events warning
# is expected here (discussed just below: age_f is derived from age).
init <- mice::mice(data, maxit = 0)
## Warning: Number of logged events: 1
names(init)
##  [1] "data"            "imp"             "m"               "where"          
##  [5] "blocks"          "call"            "nmis"            "method"         
##  [9] "predictorMatrix" "visitSequence"   "formulas"        "post"           
## [13] "blots"           "ignore"          "seed"            "iteration"      
## [17] "lastSeedValue"   "chainMean"       "chainVar"        "loggedEvents"   
## [21] "version"         "date"

We already see a warning which is due to potential collinearity between variables in our data set: age and age_f.

What is in the imputation object

We are most interested in the methods, predictorMatrix, and the visitSequence which we will work with and adjust to help set up the final imputation models.

# Pull out the three pieces of the default setup that we will adjust:
meth <- init$method                 # imputation method per variable
pred_mat <- init$predictorMatrix    # which variables predict which
visit_order <- init$visitSequence   # order variables are imputed in

methods

These are the default model specifications that the software chose based on data type.

meth[meth != ""]  # shown here as subset of those to be imputed
##               age               sex          race_eth               fpl 
##             "pmm"          "logreg"         "polyreg"          "logreg" 
##             age_f cholesterol_total 
##         "polyreg"             "pmm"

We can see the defaults that the package chooses for the variables with missing data:

See link to Van Buuren’s text online for all available.

A note on predictive mean matching (pmm)

According to Van Buuren: pmm is robust to transformation of target variables and it can handle discrete variables. pmm will not impute outside of the observed data range.

Predictive mean matching calculates the predicted value of target variable Y according to the specified imputation model. For each missing entry, the method forms a small set of candidate donors (typically with 3, 5 or 10 members) from all complete cases that have predicted values closest to the predicted value for the missing entry. One donor is randomly drawn from the candidates, and the observed value of the donor is taken to replace the missing value. The assumption is the distribution of the missing cell is the same as the observed data of the candidate donors. (Van Buuren, Flexible Imputation of Missing Data)

predictorMatrix

pred_mat %>% 
  DT::datatable()

visitSequence

We can also check the visit sequence to see the order that variables are “visited” in the imputation process. Default is the order of the variables in the data frame.

visit_order
##  [1] "pat_id"                  "age"                    
##  [3] "sex"                     "race_eth"               
##  [5] "language"                "fpl"                    
##  [7] "age_f"                   "breast_eligibility"     
##  [9] "flu_eligibility"         "cholesterol_eligibility"
## [11] "breast_claims"           "flu_claims"             
## [13] "cholesterol_claims"      "breast_ehr"             
## [15] "flu_ehr"                 "cholesterol_ehr"        
## [17] "cholesterol_total"

Specify the imputation model

In Van Buuren’s vignette and book, he lays out steps to follow when setting up multiple imputation.

Here we will go through the process with our example data to

Step 1 - Decide if the missing at random (MAR) assumption is reasonable

Assuming MAR is typically a reasonable place to start. There is literature on sensitivity analysis with the imputations to examine if this assumption is met. And there are techniques to model the missing mechanism with the imputation if there is violation. This work is outside the scope of what I hope to share here.

Exploring the missingness is so important for this first step!

Step 2 - Decide on the form of the imputation model

We want to decide the form of the model used to impute the missing values of each variable. We saved the default choices from before.

meth[meth != ""]
##               age               sex          race_eth               fpl 
##             "pmm"          "logreg"         "polyreg"          "logreg" 
##             age_f cholesterol_total 
##         "polyreg"             "pmm"

These look pretty reasonable given what we know of our data set. We could change the settings to the meth object if we liked:

meth[c("age")] <- "norm.nob"
meth[meth != ""]
##               age               sex          race_eth               fpl 
##        "norm.nob"          "logreg"         "polyreg"          "logreg" 
##             age_f cholesterol_total 
##         "polyreg"             "pmm"

Change it back before proceeding since we want to use predictive mean matching for continuous data.

meth[c("age")] <- "pmm"
meth[meth != ""]
##               age               sex          race_eth               fpl 
##             "pmm"          "logreg"         "polyreg"          "logreg" 
##             age_f cholesterol_total 
##         "polyreg"             "pmm"

Step 3 - Decide the set of predictors to include in the imputation model

What variables to include in the multiple imputation model?

The advice is to include as many relevant variables as possible. One should include all variables that are in your scientific model of interest that will be used after imputation. Also variables that are related to the missingness of the variables you are imputing (i.e. those known to be related to nonresponse). Van Buuren has more advice on this and worth reading, vignette and book

Including as many predictors as possible makes the MAR assumption more reasonable. But with larger data sets, this is not advisable for computation purposes. Van Buuren suggests that 15 to 25 variables will work well. He also offers advice to cull that list.

Discuss with collaborators and lead researchers with domain knowledge of your data

Step 3 - Decide the set of predictors to include in the imputation model

I came across these from some class notes online:

The imputation model should include variables that are:

The model should be general enough to preserve any associations among variables (two-, three-, or higher- way associations) that may be of interest in later analyses.

Step 3 - Decide the set of predictors to include in the imputation model

In our case, we are on our own. To aid in these decisions the mice package has a function that produces a “quick predictor matrix” that is useful for dealing with data sets with large number of variables. The software chooses by calculating two correlations with the available cases, taking the larger, and seeing if it meets a minimum threshold. Type ?mice::quickpred in the R console for better description.

Below I run the quickpred() to see what the software chooses. I just show the rows and columns

# Let mice suggest a predictor matrix. Per ?mice::quickpred, a predictor is
# kept when the larger (in absolute value) of (i) cor(target, predictor)
# and (ii) cor(missingness indicator of target, predictor) exceeds mincor
# (default 0.1), computed on available cases.
pred_guess <- data %>% 
  mice::quickpred()


# Show only the rows/columns that are actually used somewhere
pred_guess[rowSums(pred_guess) > 0, colSums(pred_guess) > 0] %>% 
  DT::datatable()

In practice

In my work, I’ve used a combination of the predictor matrix and collaborators input to select predictors for the imputation model.

Note that the models can have different sets of predictors.

From the help details section for mice::quickpred:

The function is designed to aid in setting up a good imputation model for data with many variables.

Basic workings: The procedure calculates for each variable pair (i.e. target-predictor pair) two correlations using all available cases per pair. The first correlation uses the values of the target and the predictor directly. The second correlation uses the (binary) response indicator of the target and the values of the predictor. If the largest (in absolute value) of these correlations exceeds mincor, the predictor will be added to the imputation set. The default value for mincor is 0.1.

In addition, the procedure eliminates predictors whose proportion of usable cases fails to meet the minimum specified by minpuc. The default value is 0, so predictors are retained even if they have no usable case.

Finally, the procedure includes any predictors named in the include argument (which is useful for background variables like age and sex) and eliminates any predictor named in the exclude argument. If a variable is listed in both include and exclude arguments, the include argument takes precedence.

Check that the included variables make sense

Ensure that patient ID (pat_id) doesn’t get included as a predictor in any of the models.

# pat_id is an arbitrary identifier; zero out its column so it is never
# used as a predictor in any imputation model.
pred_guess[, "pat_id"] <- 0

pred_guess[rowSums(pred_guess) > 0, colSums(pred_guess) > 0] %>% 
  DT::datatable()

Include the variables that looked good in our EDA

In the slides on EDA we saw that:

# Force in predictors that the EDA suggested are related to missingness.
# Rows of the matrix are imputation targets, columns are predictors;
# 1 = use as predictor. (The original had the race_eth assignment
# duplicated on two consecutive lines; the redundant copy is removed.)
pred_guess["age", c("race_eth")] <- 1
pred_guess["sex", c("race_eth")] <- 1
pred_guess["race_eth", c("age", "sex")] <- 1
pred_guess["fpl", c("age", "sex", "race_eth")] <- 1

pred_guess[rowSums(pred_guess) > 0, colSums(pred_guess) > 0] %>%
  DT::datatable()

Step 4 - Decide how to impute variables that are functions of other (incomplete) variables

Transformations, sum scores, etc. were not used in this data set so not much to consider here. In some cases, there can be a lot to think about particularly if a variable is transformed solely to meet the normality assumption in a regression model. So do a literature review if this issue applies to you.

Need to handle the derived variable age_f which is calculated from age. See chapter in Van Buuren text for a few different methods. Here I will use “passive imputation” to impute age_f as a derived variable.

# User defined function to create age_f, a categorical version of the
# continuous age variable. Used for "passive imputation": age_f is derived
# from the (possibly imputed) age rather than imputed directly.
#
# NOTE(review): with breaks = c(19, 35, 50, 65) and right = FALSE the bins
# are [19,35), [35,50), [50,65), so the first bin contains age 34 even
# though its label reads "19-<34". The labels deliberately match the
# existing factor levels of age_f -- confirm the cut points are intended.
calc_age_f <- function(x) { 
  cut(x, 
      breaks = c(19, 35, 50, 65), 
      right = FALSE, 
      labels = c('19-<34', '35-<50', '51-<64'))
}

# The "~I(...)" method string tells mice to recompute age_f from the
# current value of age on every visit (passive imputation).
meth["age_f"] <- "~I(calc_age_f(age))"
meth[meth != ""]
##                   age                   sex              race_eth 
##                 "pmm"              "logreg"             "polyreg" 
##                   fpl                 age_f     cholesterol_total 
##              "logreg" "~I(calc_age_f(age))"                 "pmm"

Revisit the predictor matrix

Since it’s based on other variables in the data, there will be collinearity in the imputation model when age_f is included. The software should detect this automatically, give a warning, and drop the variable automatically. Best to ensure that it is not included in the imputation model anyways.

I also don’t want to impute cholesterol_total or include it in any of my imputation models.

# Exclude as predictors in all models: age_f is derived from age (perfectly
# collinear), and cholesterol_total should not inform the other models.
pred_guess[, "age_f"] <- 0  
pred_guess[, "cholesterol_total"] <- 0

# Exclude from being imputed (empty predictor row).
# NOTE(review): meth["cholesterol_total"] is still "pmm"; the variable is
# also omitted from visit_order later, so it is never actually imputed.
# Setting meth["cholesterol_total"] <- "" would make that intent explicit.
pred_guess["cholesterol_total", ] <- 0

pred_guess[rowSums(pred_guess) > 0, colSums(pred_guess) > 0] %>% 
  DT::datatable()

Step 5 - Decide the order to impute the variables

The default in the software goes by appearance in the data set left to right. It can be overwritten per the user’s direction. This becomes more of an issue if there is a longitudinal nature to the data set where missingness at an earlier time point would affect the missingness later. So impute early to later.

I will examine the imputation order by magnitude of missingness (low to high and high to low). To see if there is a difference in performance or convergence or any impact to the estimates. I usually default to imputing from highest percent missing to lowest.

visit_order
##  [1] "pat_id"                  "age"                    
##  [3] "sex"                     "race_eth"               
##  [5] "language"                "fpl"                    
##  [7] "age_f"                   "breast_eligibility"     
##  [9] "flu_eligibility"         "cholesterol_eligibility"
## [11] "breast_claims"           "flu_claims"             
## [13] "cholesterol_claims"      "breast_ehr"             
## [15] "flu_ehr"                 "cholesterol_ehr"        
## [17] "cholesterol_total"

Override this by ordering based on magnitude of missing that we got from the EDA

data %>% 
  naniar::miss_var_summary() %>% 
  dplyr::filter(n_miss > 0)
## # A tibble: 6 x 3
##   variable          n_miss pct_miss
##   <chr>              <int>    <dbl>
## 1 cholesterol_total   8042    61.4 
## 2 fpl                 2813    21.5 
## 3 age                 1947    14.9 
## 4 age_f               1947    14.9 
## 5 sex                 1313    10.0 
## 6 race_eth             551     4.21
# Visit variables from highest to lowest percent missing (fpl 21.5%,
# age/age_f 14.9%, sex 10.0%, race_eth 4.2% per the summary above).
# Variables not listed here (e.g. cholesterol_total) are never visited,
# hence never imputed.
visit_order <- c("fpl", 
                 "age", 
                 "age_f", 
                 "sex", 
                 "race_eth")
visit_order
## [1] "fpl"      "age"      "age_f"    "sex"      "race_eth"

Step 6 - Decide the number of iterations

This is to ensure convergence. The default is 5. 10 to 20 are recommended.

iter_num
## [1] 5

Step 7 - Decide on the number of multiply imputed data sets

The rule of thumb from more recent authors is that the number of imputations should be similar to the percentage of cases (observations) that are incomplete (at least 5).

The software default is 5 and I typically use that when running for the first time and when checking convergence to save time.

We had set this previously above.

imp_num
## [1] 5

Run the algorithm

All the setup work has been done and considerations made. Using the specifications that saved in objects above, we will run the mice command to impute the data sets.

# Run the imputation using the tailored method vector, predictor matrix,
# and visit sequence built above; seed set for reproducibility.
imp <- mice::mice(data = data, 
                  m = imp_num,             # number of imputations, dflt = 5
                  method = meth,           # specify the method
                  predictorMatrix = pred_guess, 
                  visitSequence = visit_order, 
                  seed = seed_for_imp, 
                  maxit = iter_num,        # number of iterations, dflt = 5
                  print = FALSE)

Check for any issues or warnings

imp$loggedEvents
## NULL

Check convergence

We plot the values of mean and standard deviation for each variable by number of iteration. We want to look for mixing of the lines and we do not want to see any trend.

I tend to think about issues with convergence similar to those that come up with any kind of modeling specifically combinations of categorical predictors that cause (almost perfect) data separation.

plot(imp, c("fpl", 
            "age", 
            "age_f", 
            "sex", 
            "race_eth"))

Extend the number of iterations and check again

Hard to tell from just 5 iterations. But checking above there was nothing that immediately jumped out. Let’s run for more iterations to make sure and see when things start to settle down if at all.

# Continue the chains from where `imp` stopped rather than restarting:
# 5 previous + 35 additional iterations = 40 total (hence the name imp40).
imp40 <- mice::mice.mids(imp, 
                         maxit = 35, 
                         print = FALSE)


# Trace plots again over the longer run; look for well-mixed chains
# with no trend.
plot(imp40, c("fpl", 
              "age", 
              "age_f", 
              "sex", 
              "race_eth"))

Diagnostics

See Van Buuren chapter 6.6. The mice package contains a few functions to help graphically assess the differences between the observed and imputed data.

Discrepancies may be due to the imputation model, missingness assumption, or both.

For continous variables

Best for small data sets

stripplot(imp, age)

Better for larger data sets

bwplot(imp, age)

densityplot(imp, ~ age)

Categorical variables

Not a lot out there for categorical variables. It’s not ideal but you can use the densityplot function on categorical variables to get an idea.

densityplot(imp, ~ sex)

I’ve made plots on my own which gets a little hard to explain. The general idea is to

# Add a column to the data to indicate if there were missing values for sex
missing_sex <- data %>% 
  dplyr::select(pat_id, sex) %>% 
  naniar::add_any_miss(data = ., 
                       sex, 
                       missing = "imputed", 
                       complete = "observed")

# Now have an indicator for missing sex: any_miss_vars is either
# "imputed" or "observed" for each patient.

# complete(imp, "long") produces a data set where the imputed data sets
# are stacked vertically, with .imp identifying the imputation number.
imp_long <- mice::complete(imp, "long")

# Join the missing indicator to the long data (one row per patient per
# imputation, keyed on pat_id)
imp_long <- missing_sex %>% 
  dplyr::select(pat_id, any_miss_vars) %>% 
  dplyr::left_join(., 
                   imp_long, 
                   by = "pat_id")


# Similar to the strip plot idea: observed values in gold, imputed values
# in blue, one column of points per imputation (.imp), jittered so
# overlapping points remain visible. Imputed values should look plausible
# against the observed distribution.
ggplot() + 
  geom_jitter(data = imp_long %>% dplyr::filter(any_miss_vars == "observed"), 
              aes(x = .imp, 
                  y = sex), 
              width = 0.10, 
              height = 0.25, 
              colour = "goldenrod", 
              alpha = 0.4) + 
  geom_jitter(data = imp_long %>% dplyr::filter(any_miss_vars == "imputed"), 
              aes(x = .imp, 
                  y = sex), 
              width = 0.10, 
              height = 0.25, 
              colour = "darkblue", 
              alpha = 0.7)

# Plotted another way: counts of sex, faceted by imputation number (rows)
# and observed/imputed status (columns)

ggplot(data = imp_long, 
       aes(x = sex)) + 
  geom_bar(stat = "count") + 
  facet_grid(.imp ~ any_miss_vars)

Final imputation model

Once you are settled on the imputation model and reviewed the process for convergence, you are ready to create the multiple imputations. Double check your inputs, increase the number of imputations to perform.

# Final run: 40 imputations (roughly the percentage of incomplete cases)
# and 20 iterations, using the vetted method/predictor/visit settings.
imp_final <- mice::mice(data = data, 
                        m = 40,             # number of imputations, dflt = 5
                        method = meth,           # specify the method
                        predictorMatrix = pred_guess, 
                        visitSequence = visit_order, 
                        seed = seed_for_imp, 
                        maxit = 20,        # number of iterations, dflt = 5
                        print = FALSE)
# Timing from the original run:
#    user  system elapsed 
# 450.477  61.924 515.250

Save the imputation results

Save the results to avoid having to re-run the multiple imputations.

save(imp_final,  
     file = here::here("data", 
                       "imp-data", 
                       "imp_final.rda"))

Re-load the imputation results

We will use final imputation data imp_final for the rest of the slides.

load(file = here::here("data", 
                       "imp-data", 
                       "imp_final.rda"))

plot(imp_final)

Analyzing imputed data sets

Rubin’s Rules for pooling were covered in Miguel’s slides.

Also covered in Van Buuren: vignette and book

Steps to pool by hand with mice

  1. Figure out how to obtain the estimated statistic of interest and its standard error.
    • I often end up writing a function to get at these cleanly.
    • Test it on the complete data.
  2. Loop over each imputed data sets.
    • For each imputation, obtain the estimate and standard error
  3. Combine using Rubin’s rules.

Analyzing imputed data sets

Here we will look at:

Kappa

Here we are interested in the agreement between the claims data and the EHR data.

We did not impute the claims and EHR, but we did impute demographics. So let’s compare stratified agreement with complete case data and with multiply imputed data.

Agreement along the diagonal: Both sources say “Yes” or “No”. Kappa is considered “chance-adjusted” agreement.

Chosen here because it was of interest with the original study, but, also, because we will not transform kappa before pooling: simpler example.

# Complete-case 2x2 table: breast screening per claims vs per EHR, among
# females (dplyr::filter also drops rows where sex is missing).
tab <- data %>% 
  dplyr::filter(sex == "Female") %>% 
  with(., table(breast_claims, 
                breast_ehr))

tab
##              breast_ehr
## breast_claims  Yes   No
##           Yes  657  639
##           No   569 5894

Kappa from complete case

# Unweighted Cohen's kappa with 95% CI from the complete-case table
complete_kappa_res <- tab %>% 
  psych::cohen.kappa() %>% 
  broom::tidy() %>% 
  janitor::clean_names() %>%    # e.g. conf.low -> conf_low
  dplyr::filter(type == "unweighted") %>% 
  dplyr::select(kappa = estimate, 
                lower_ci = conf_low, 
                upper_ci = conf_high)

complete_kappa_res
## # A tibble: 1 x 3
##   kappa lower_ci upper_ci
##   <dbl>    <dbl>    <dbl>
## 1 0.428    0.401    0.455

Kappa from multiple imputation

Need to get the estimate and standard error for the statistic of interest from each imputed data set.

I’ve found that it’s best to create a wrapper function and test it on the original data first

# Wrapper function to
#   1. Filter the data set to one stratum (default sex == "Female",
#      matching the complete-case analysis above)
#   2. Cross-tabulate x by y
#   3. Calculate unweighted Cohen's kappa and its asymptotic SE
#
# Args:
#   data      - data frame (original data or one completed imputation)
#   x, y      - character names of the two factor columns to compare
#   strat_var - character name of the column to stratify on (generalizes
#               the previously hard-coded "sex"; defaults preserve the
#               original behavior)
#   strat_val - value of strat_var to keep
# Returns a one-row tibble with n, kappa, and se.
calc_kappa <- function(data, x, y, strat_var = "sex", strat_val = "Female") { 
  
  # .data pronoun lets us filter on a column named by a string; like the
  # original `sex == "Female"`, rows with a missing stratum value drop out
  data_filtered <- data %>% 
    dplyr::filter(.data[[strat_var]] == strat_val)
  
  tab <- table(data_filtered[[x]], data_filtered[[y]])
  
  k_res <- psych::cohen.kappa(x = tab)
  
  tibble::tibble(n = k_res$n.obs, 
                 kappa = k_res$kappa, 
                 se = sqrt(k_res$var.kappa) # Confirmed correct with vcd package
  )
  
}
# Sanity check on the original (unimputed) data first
calc_kappa(data = data, 
           x = "breast_claims", 
           y = "breast_ehr")
## # A tibble: 1 x 3
##       n kappa     se
##   <int> <dbl>  <dbl>
## 1  7759 0.428 0.0137

Kappa from multiple imputation

Obtain the estimate and standard error for each imputed data set

# Set up for pooling: one within-imputation estimate (Q) and one
# within-imputation variance (U) per imputed data set. Preallocate with
# NA_real_ (numeric NA) since both hold doubles.
m <- imp_final$m
Q <- rep(NA_real_, m)
U <- rep(NA_real_, m)

# seq_len(m) rather than 1:m -- safe even if m were 0
for (i in seq_len(m)) {
  
  kappa_res <- calc_kappa(data = complete(imp_final, i), 
                          x = "breast_claims", 
                          y = "breast_ehr")
  
  Q[i] <- kappa_res$kappa
  
  U[i] <- (kappa_res$se) ^ 2         # (standard error of estimate)^2
}
Q
##  [1] 0.4438936 0.4443171 0.4439941 0.4440877 0.4440158 0.4440877 0.4439726
##  [8] 0.4439870 0.4440014 0.4440302 0.4440903 0.4441021 0.4437642 0.4442169
## [15] 0.4441164 0.4446541 0.4445511 0.4440014 0.4445356 0.4440302 0.4438936
## [22] 0.4442488 0.4448959 0.4444211 0.4440733 0.4442985 0.4441595 0.4437561
## [29] 0.4439438 0.4439582 0.4446087 0.4440589 0.4440014 0.4440514 0.4436344
## [36] 0.4440080 0.4443680 0.4444067 0.4444827 0.4440014
U
##  [1] 0.0001517263 0.0001516603 0.0001516851 0.0001517544 0.0001517839
##  [6] 0.0001517544 0.0001518016 0.0001517957 0.0001517898 0.0001517780
## [11] 0.0001518658 0.0001517485 0.0001517794 0.0001517014 0.0001517426
## [16] 0.0001517467 0.0001518922 0.0001517898 0.0001516829 0.0001517780
## [21] 0.0001517263 0.0001518007 0.0001517505 0.0001517300 0.0001517603
## [26] 0.0001516726 0.0001517250 0.0001518905 0.0001518135 0.0001518075
## [31] 0.0001518685 0.0001517662 0.0001517898 0.0001516616 0.0001518327
## [36] 0.0001518949 0.0001517470 0.0001517358 0.0001517000 0.0001517898

Another way to loop

# A more "tidyverse-y" way: map the wrapper over the list of completed
# data sets and row-bind the results, keeping the imputation number in
# column `m`
imp_final %>% 
  mice::complete("all") %>% 
  purrr::map_dfr(.f = function(dat) {
    calc_kappa(data = dat, 
               x = "breast_claims", 
               y = "breast_ehr")
  }, 
  .id = "m") %>% 
  mutate(Q = kappa, 
         U = se ^ 2) 
## # A tibble: 40 x 6
##    m         n kappa     se     Q        U
##    <chr> <int> <dbl>  <dbl> <dbl>    <dbl>
##  1 1      8678 0.444 0.0123 0.444 0.000152
##  2 2      8690 0.444 0.0123 0.444 0.000152
##  3 3      8685 0.444 0.0123 0.444 0.000152
##  4 4      8674 0.444 0.0123 0.444 0.000152
##  5 5      8669 0.444 0.0123 0.444 0.000152
##  6 6      8674 0.444 0.0123 0.444 0.000152
##  7 7      8666 0.444 0.0123 0.444 0.000152
##  8 8      8667 0.444 0.0123 0.444 0.000152
##  9 9      8668 0.444 0.0123 0.444 0.000152
## 10 10     8670 0.444 0.0123 0.444 0.000152
## # … with 30 more rows

mice::pool.scalar

Pass the estimates and standard errors to mice::pool.scalar

# Pool with Rubin's rules (Barnard-Rubin 1999 adjusted df).
# BUG FIX: `n` is the complete-data sample size and must come from the
# analysis data set (`data`), not `nhanes` (mice's 25-row example data,
# left over from the documentation example). `n` enters only the
# degrees-of-freedom calculation, so $qbar and $t are unaffected.
pooled_est <- mice::pool.scalar(Q, U, n = nrow(data), k = 1)  # Barnard-Rubin 1999
# Estimated pooled Kappa
pooled_est$qbar
## [1] 0.444143
# 95% Confidence interval (normal approximation)
pooled_est$qbar + c(-1.96, 1.96) * sqrt(pooled_est$t)
## [1] 0.4199912 0.4682948

From the documentation, all the components

Here we are only interested in a few of the components.

Per ?mice::pool.scalar, returns a list with components:

Compare with complete case analysis

# Stack the MI (pooled) estimate on top of the complete-case estimate
# so the two kappas can be compared side by side
combined_kappa_res <- tibble::tibble(
  kappa    = pooled_est$qbar, 
  lower_ci = pooled_est$qbar - 1.96 * sqrt(pooled_est$t), 
  upper_ci = pooled_est$qbar + 1.96 * sqrt(pooled_est$t)
) %>% 
  dplyr::bind_rows(complete_kappa_res) %>% 
  mutate(scenario = c("mi", "complete"))

combined_kappa_res
## # A tibble: 2 x 4
##   kappa lower_ci upper_ci scenario
##   <dbl>    <dbl>    <dbl> <chr>   
## 1 0.444    0.420    0.468 mi      
## 2 0.428    0.401    0.455 complete
# Point-and-interval plot of the two kappa estimates on a common 0-1 axis
combined_kappa_res %>% 
  ggplot(aes(x = kappa, y = scenario)) + 
  geom_errorbar(aes(xmin = lower_ci, xmax = upper_ci), 
                width = 0.05) + 
  geom_point(size = 2) + 
  scale_x_continuous(limits = c(0, 1))

Even more by hand

Point estimate

\[\bar{\theta} = \frac{1}{m} \sum_{i=1}^{m} \hat{\theta}_{i}\]

We have all the estimates that we need to do this by hand using Rubin’s Rules

# Number of imputations and the per-imputation estimates
pooled_est$m
## [1] 40
pooled_est$qhat
##  [1] 0.4438936 0.4443171 0.4439941 0.4440877 0.4440158 0.4440877 0.4439726
##  [8] 0.4439870 0.4440014 0.4440302 0.4440903 0.4441021 0.4437642 0.4442169
## [15] 0.4441164 0.4446541 0.4445511 0.4440014 0.4445356 0.4440302 0.4438936
## [22] 0.4442488 0.4448959 0.4444211 0.4440733 0.4442985 0.4441595 0.4437561
## [29] 0.4439438 0.4439582 0.4446087 0.4440589 0.4440014 0.4440514 0.4436344
## [36] 0.4440080 0.4443680 0.4444067 0.4444827 0.4440014
# Pooled point estimate: the simple average of the m estimates
est <- sum(pooled_est$qhat) / pooled_est$m
est
## [1] 0.444143

This is the same as we saw above

pooled_est$qbar
## [1] 0.444143

Even more by hand

Within variance

\[\bar{V} = \frac{1}{m} \sum_{i=1}^{m} V_{i}\]

# Per-imputation (within-imputation) variances
pooled_est$u
##  [1] 0.0001517263 0.0001516603 0.0001516851 0.0001517544 0.0001517839
##  [6] 0.0001517544 0.0001518016 0.0001517957 0.0001517898 0.0001517780
## [11] 0.0001518658 0.0001517485 0.0001517794 0.0001517014 0.0001517426
## [16] 0.0001517467 0.0001518922 0.0001517898 0.0001516829 0.0001517780
## [21] 0.0001517263 0.0001518007 0.0001517505 0.0001517300 0.0001517603
## [26] 0.0001516726 0.0001517250 0.0001518905 0.0001518135 0.0001518075
## [31] 0.0001518685 0.0001517662 0.0001517898 0.0001516616 0.0001518327
## [36] 0.0001518949 0.0001517470 0.0001517358 0.0001517000 0.0001517898
# Within variance: average of the m per-imputation variances
v_w <- sum(pooled_est$u) / pooled_est$m
v_w
## [1] 0.000151768
pooled_est$ubar
## [1] 0.000151768

Between variance

\[B = \frac{1}{m - 1} \sum_{i=1}^{m} {(\hat{\theta}_{i} - \bar{\theta})} ^ 2\]

# Between variance: sample variance of the m point estimates
v_b <- sum((pooled_est$qhat - pooled_est$qbar) ^ 2) / (pooled_est$m - 1)
v_b
## [1] 7.044579e-08
pooled_est$b
## [1] 7.044579e-08

Total variance

\[T = \bar{V} + (1 + {m}^{-1})B\]

# Total variance: within variance plus between variance inflated by
# (1 + 1/m) to account for the finite number of imputations
v_w + (1 + 1 / pooled_est$m) * v_b
## [1] 0.0001518402
pooled_est$t
## [1] 0.0001518402

Odds ratio by hand

The trick with the odds ratio is to calculate the log odds ratio, pool on the log scale, and then exponentiate.

First write a function

# Calculate the log(odds ratio) and its approximate standard error
# from the 2x2 table of `x` by `y`.
#
# NOTE: a zero cell makes the log(OR) +/-Inf and the SE Inf; if that
# can occur in your data, consider the usual 0.5 continuity correction.
calc_ln_or <- function(data, x, y) { 
  
  tab <- table(data[[x]], data[[y]])
  
  # Guard: indexing below silently assumes a 2x2 table; fail loudly if
  # a level is absent (e.g. after filtering or in an imputed data set)
  stopifnot(identical(dim(tab), c(2L, 2L)))
  
  n_11 <- tab[1, 1]
  n_12 <- tab[1, 2]
  n_21 <- tab[2, 1]
  n_22 <- tab[2, 2]
  
  # or <- (n_11 * n_22) / (n_12 * n_21)
  # Flipping the odds ratio to have the reference levels match
  or <- (n_12 * n_21) / (n_11 * n_22)
  ln_or <- log(or)
  
  # approximate standard error for ln(or)
  se_ln_or <- sqrt((1 / n_11) + (1 / n_12) + (1 / n_21) + (1 / n_22))
  
  tibble::tibble(ln_or = ln_or, 
                 se_ln_or = se_ln_or)
  
}

Test it on the complete data

# Complete-case log(OR) of flu_ehr by sex
or_cc <- calc_ln_or(data = data, x = "sex", y = "flu_ehr")

# Estimated Odds Ratio (back-transformed from the log scale)
exp(or_cc$ln_or)
## [1] 1.065003
# 95% Confidence interval
exp(or_cc$ln_or + c(-1.96, 1.96) * or_cc$se_ln_or)
## [1] 0.9815445 1.1555579

Loop over each imputed data set

# Set up for pooling: one log(OR) estimate (Q) and one squared SE (U)
# per imputed data set
m <- imp_final$m
Q <- rep(NA, m)
U <- rep(NA, m)

for (i in seq_len(m)) {
  
  res_i <- calc_ln_or(data = complete(imp_final, i), 
                      x = "sex", 
                      y = "flu_ehr")
  
  Q[i] <- res_i$ln_or
  
  U[i] <- res_i$se_ln_or ^ 2  # within-imputation variance = SE^2
}

Combine using Rubin’s Rules and mice::pool.scalar

# BUG FIX: `n` must be the analysis data's sample size (`data`), not
# `nhanes` (mice's 25-row example data set). `n` feeds only the
# Barnard-Rubin (1999) degrees of freedom, so the printed $df and $fmi
# below will update, while $qbar, $ubar, $b, and $t are unchanged.
pooled_est <- mice::pool.scalar(Q, U, n = nrow(data), k = 1)  # Barnard-Rubin 1999

pooled_est
## $m
## [1] 40
## 
## $qhat
##  [1] 0.04354367 0.04660435 0.03017746 0.03842249 0.02817529 0.03226535
##  [7] 0.03587783 0.02715582 0.04304835 0.04099474 0.04050883 0.04354876
## [13] 0.03433171 0.04148236 0.03944401 0.04405491 0.04051502 0.03843606
## [19] 0.03479434 0.03176402 0.03738711 0.03535999 0.04354367 0.03532888
## [25] 0.04252689 0.02759346 0.03943759 0.02362538 0.04100668 0.03382990
## [31] 0.05329689 0.04355386 0.02920490 0.03993000 0.04203866 0.03130897
## [37] 0.05789426 0.04097694 0.03583219 0.03535999
## 
## $u
##  [1] 0.001538653 0.001540183 0.001541970 0.001538795 0.001539600 0.001539779
##  [7] 0.001537865 0.001539430 0.001537066 0.001537721 0.001535643 0.001538152
## [13] 0.001538611 0.001539817 0.001538967 0.001538740 0.001535148 0.001537792
## [19] 0.001541730 0.001539189 0.001539629 0.001538280 0.001538653 0.001540295
## [25] 0.001537979 0.001543070 0.001539471 0.001537334 0.001536723 0.001538024
## [31] 0.001533831 0.001537651 0.001539267 0.001541074 0.001535899 0.001536104
## [37] 0.001536093 0.001539226 0.001540887 0.001538280
## 
## $qbar
## [1] 0.03810454
## 
## $ubar
## [1] 0.001538566
## 
## $b
## [1] 4.721021e-05
## 
## $t
## [1] 0.001586956
## 
## $df
## [1] 21.53355
## 
## $r
## [1] 0.03145167
## 
## $fmi
## [1] 0.1095279

Extra step

Transform the estimates (in this case, exponentiate)

# Estimated pooled Odds Ratio -- pooling was done on the log scale
# (where the normal approximation holds), so exponentiate afterwards
exp(pooled_est$qbar)
## [1] 1.03884
# 95% Confidence interval: exponentiate the log-scale interval limits
exp(pooled_est$qbar + c(-1.96, 1.96) * sqrt(pooled_est$t))
## [1] 0.9608132 1.1232029

Odds ratio with logistic regression

Complete case

# Complete-case logistic regression of flu_ehr ("Yes" vs not) on sex;
# broom::tidy() with exponentiate = TRUE reports odds ratios and CIs
or_lr_cc <- broom::tidy(
  glm((flu_ehr == "Yes") ~ sex, 
      family = binomial(link = "logit"), 
      data = data), 
  conf.int = TRUE, 
  exponentiate = TRUE
)

or_lr_cc
## # A tibble: 2 x 7
##   term        estimate std.error statistic   p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 (Intercept)    0.451    0.0245    -32.4  8.72e-231    0.430     0.474
## 2 sexMale        1.07     0.0416      1.51 1.30e-  1    0.981     1.16

Multiply-imputed data

# Fit the same logistic model within each imputed data set
or_lr_mi_fit <- with(imp_final, 
                     glm((flu_ehr == "Yes") ~ sex, 
                         family = binomial(link = "logit")))

# Pool the coefficients with Rubin's rules (mice::pool) and summarise
# on the odds-ratio scale in one step
or_lr_mi <- mice::pool(or_lr_mi_fit) %>% 
  summary(conf.int = TRUE, exponentiate = TRUE)
or_lr_mi
##          term  estimate  std.error   statistic        df  p.value     2.5 %
## 1 (Intercept) 0.4843068 0.02302561 -31.4882767 12492.251 0.000000 0.4629342
## 2     sexMale 1.0388398 0.03983661   0.9565207  9746.982 0.338833 0.9608053
##      97.5 %
## 1 0.5066662
## 2 1.1232121

Compare results

Not the prettiest way to get there… Here, from each scenario, I pull out the odds ratios and confidence intervals for visualization.

# Build one row per scenario (OR with 95% CI), then plot on a log axis.
# NOTE(review): the extraction style differs by object class -- or_lr_cc
# is a tibble, so `[2, "estimate"]` returns a 1x1 tibble and `[[1]]` is
# needed to drop to a scalar; or_lr_mi is a base data.frame, where
# `[2, "estimate"]` already returns a scalar.
tibble::tribble(
  ~scenario,                         ~or,                                          ~lower_ci,                                          ~upper_ci,
  "Complete case, 2x2 table",            exp(or_cc$ln_or),     exp(or_cc$ln_or - 1.96 * or_cc$se_ln_or),     exp(or_cc$ln_or + 1.96 * or_cc$se_ln_or),
  "MI, 2x2 table",        exp(pooled_est$qbar),   exp(pooled_est$qbar - 1.96 * sqrt(pooled_est$t)),   exp(pooled_est$qbar + 1.96 * sqrt(pooled_est$t)),
  "Complete case, logistic reg",     or_lr_cc[2, "estimate"][[1]],                            or_lr_cc[2, "conf.low"][[1]],                           or_lr_cc[2, "conf.high"][[1]],
  "MI, logistic reg",     or_lr_mi[2, "estimate"],                               or_lr_mi[2, "2.5 %"],                              or_lr_mi[2, "97.5 %"]
) %>% 
  ggplot(data = ., 
         aes(x = or, 
             y = scenario)) + 
  geom_errorbar(aes(xmin = lower_ci, xmax = upper_ci), 
                width = 0.05) + 
  geom_point(size = 2) + 
  # log10 axis keeps odds ratios symmetric around OR = 1
  scale_x_continuous(trans = "log10", 
                     breaks = seq(0, 5, 1), 
                     limits = c(0.1, 5)) +
  labs(x = "Odds ratio (log scale)",
       y = "scenario")